#Creating Bonus_Challange
#Acquiring SRR accesions from the SRA run selector (input for fasterq-dump)
#Creating a bash script for automizing download of fastq files, renaming them, and quality control with fastqc and multiqc. Output of fastq files will be moved to fastqc_results folder.
#Using nano for making and editing the bash script
#The script
#Making the bash script executable and running it with nohup in the background. Cheking the process with cat nohup.out
#Result of the first script
#Moving the fastqc_results folder to computer with filezilla
#Per base quality scores of all samples seems normal meaning that there was no significant problem during sequencing
#Per sequence quality scores shows reads have high quality scores on average
#Per Sequence GC Content in mock_1 which is not a virus infected cell line shows abnormal GC content.
#Sequence Duplication Levels
#Over-represented sequences were present in both samples
#Over-represented sequences in Mock samples belonged to human genome mainly mitochondrial chormosome. This was checked by NCBI blast.
#Overrepresented sequences in RSV samples belonged to RSV virus infecting the cells.
#Overall the samples don’t require any trimming
#Mapping and calculating expressions using RSEM + quality control of mapping
#Copying The STAR Index folder from the previous analysis
#Using nano for making and editing the bash script
#The script
#Making the bash script executable and running it with nohup in the background. Cheking the process with cat nohup.out
#additional Script for postmapping quality check
#The mapping quality check will be moved to mapping_quality_results directory and we will download them with filezilla from the server.
#Post-Mapping Results:
#The higher percentage of unaligned sequences in RSV samples is to be expected
#Setting up the working directory and loading packages
getwd()
## [1] "/home/behrad/Desktop/Series 8"
setwd("/home/behrad/Desktop/Series 8/")
library(edgeR)
## Loading required package: limma
library("vidger")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(openxlsx)
library(ggvenn)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: grid
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Reading data
Mock_1 <- read.table("expression_results/Series8_A549_Mock_1.genes.results", header = TRUE, sep ="\t", row.names = 1)
Mock_2 <- read.table("expression_results/Series8_A549_Mock_2.genes.results", header = TRUE, sep ="\t", row.names = 1)
Mock_3 <- read.table("expression_results/Series8_A549_Mock_3.genes.results", header = TRUE, sep ="\t", row.names = 1)
RSV_1 <- read.table("expression_results/Series8_A549_RSV_1.genes.results", header = TRUE, sep ="\t", row.names = 1)
RSV_2 <- read.table("expression_results/Series8_A549_RSV_2.genes.results", header = TRUE, sep ="\t", row.names = 1)
RSV_3 <- read.table("expression_results/Series8_A549_RSV_3.genes.results", header = TRUE, sep ="\t", row.names = 1)
#Creating expected counts table from RSEM expected counts of every sample
Expected_Counts_Table <- cbind(Mock_1[,4], Mock_2[,4], Mock_3[,4], RSV_1[,4], RSV_2[,4], RSV_3[,4])
colnames(Expected_Counts_Table) <- c("Mock_1", "Mock_2", "Mock_3", "RSV_1", "RSV_2", "RSV_3")
rownames(Expected_Counts_Table) <- row.names(Mock_1)
head(Expected_Counts_Table) #let's see what it looks like
## Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A1BG 88.00 25.00 28.00 28.03 14.00 25.46
## A1BG-AS1 14.98 5.34 17.12 16.08 6.33 18.33
## A1CF 3.00 5.00 13.00 4.00 2.00 1.00
## A2M 1.00 1.00 2.00 4.00 2.00 2.00
## A2M-AS1 2.00 12.00 15.00 10.00 5.00 5.00
## A2ML1 0.00 0.00 0.00 0.00 0.00 0.00
#Creating edgeR DGEList object from expected counts table
DG <- DGEList(counts = Expected_Counts_Table, group = c(rep("Mock", 3), rep("RSV", 3)),
samples = colnames(Expected_Counts_Table), remove.zeros = TRUE)
## Removing 8909 rows with all zero counts
#Having a look
head(Expected_Counts_Table)
## Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A1BG 88.00 25.00 28.00 28.03 14.00 25.46
## A1BG-AS1 14.98 5.34 17.12 16.08 6.33 18.33
## A1CF 3.00 5.00 13.00 4.00 2.00 1.00
## A2M 1.00 1.00 2.00 4.00 2.00 2.00
## A2M-AS1 2.00 12.00 15.00 10.00 5.00 5.00
## A2ML1 0.00 0.00 0.00 0.00 0.00 0.00
head(DG$counts)
## Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A1BG 88.00 25.00 28.00 28.03 14.00 25.46
## A1BG-AS1 14.98 5.34 17.12 16.08 6.33 18.33
## A1CF 3.00 5.00 13.00 4.00 2.00 1.00
## A2M 1.00 1.00 2.00 4.00 2.00 2.00
## A2M-AS1 2.00 12.00 15.00 10.00 5.00 5.00
## A3GALT2 0.00 0.00 0.00 0.00 0.00 1.00
head(cpm(DG))
## Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A1BG 18.4650273 4.8261747 3.1494986 2.2271203 2.1538769 2.24617399
## A1BG-AS1 3.1432512 1.0308709 1.9256934 1.2776345 0.9738601 1.61713941
## A1CF 0.6294896 0.9652349 1.4622672 0.3178195 0.3076967 0.08822364
## A2M 0.2098299 0.1930470 0.2249642 0.3178195 0.3076967 0.17644729
## A2M-AS1 0.4196597 2.3165639 1.6872314 0.7945488 0.7692418 0.44111822
## A3GALT2 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.08822364
dim(Expected_Counts_Table)
## [1] 28250 6
dim(cpm(DG))
## [1] 19341 6
#Removing genes with very low expression (keeping those with at least 10 cpm in at least 2 samples)
DG_keep <- rowSums(cpm(DG) > 10) >= 2
DG <- DG[DG_keep, ,keep.lib.sizes = FALSE]
dim(cpm(DG))
## [1] 10185 6
#Calculate scaling factors to normalize library sizes - TMM NORMALIZATION
DG$samples$norm.factors #Initial factors (all ones, we still need to normalize)
## [1] 1 1 1 1 1 1
head(cpm(DG)) #Non normalized CPM
## Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A4GALT 24.0068086 14.884417 12.217498 11.661062 11.991300 13.485100
## AAAS 107.2870651 87.739720 86.093397 45.920457 49.055320 41.169743
## AACS 18.2706685 71.092675 67.481694 65.141103 65.562824 55.815813
## AADAC 4.6738919 12.730093 16.099694 22.437491 20.867977 22.772852
## AADACP1 1.0622482 5.092037 9.020396 10.696008 7.319365 10.359415
## AADAT 0.8497985 11.359160 11.532405 4.423161 6.073516 3.393601
DG <- calcNormFactors(DG) #We compute the scaling factors for each library
DG$samples$norm.factors #The scaling factors
## [1] 0.5235643 1.1989890 1.2078215 1.1078303 1.0743780 1.1081070
head(cpm(DG))
## Mock_1 Mock_2 Mock_3 RSV_1 RSV_2 RSV_3
## A4GALT 45.852650 12.414140 10.115318 10.526036 11.161156 12.169493
## AAAS 204.916712 73.178086 71.279901 41.450804 45.659273 37.153220
## AACS 34.896707 59.293851 55.870586 58.800615 61.023981 50.370418
## AADAC 8.927065 10.617356 13.329531 20.253545 19.423310 20.551131
## AADACP1 2.028878 4.246942 7.468319 9.654916 6.812653 9.348750
## AADAT 1.623103 9.473949 9.548103 3.992634 5.653053 3.062521
#Multidimensional scaling plot of samples
plotMDS(DG, cex = 0.6, col = c(rep("blue", 3), rep("red", 3)))
DG <- estimateDisp(DG)
## Using classic mode.
head(DG$tagwise.dispersion)
## [1] 0.27829500 0.19452484 0.09477444 0.06399553 0.14462380 0.25665422
plotBCV(DG)
#Perform test for differential expression between the two conditions.
Diff_expr_test <- exactTest(DG, c("Mock", "RSV"))
#Select genes with significant diff. expr. (FDR <= 0.01)
Diff_expr_test_sign <- topTags(Diff_expr_test, n = nrow(Diff_expr_test), p.value = 0.01)
#Having a look at the DEGs
head(Diff_expr_test_sign$table)
## logFC logCPM PValue FDR
## IFIT1 6.804468 7.905784 4.173923e-46 4.251141e-42
## MX1 6.494524 6.919100 3.444277e-45 1.753998e-41
## IFIT3 6.099768 7.291971 4.623207e-44 1.249133e-40
## IFIT2 7.340081 7.800481 4.905775e-44 1.249133e-40
## OASL 8.051167 7.188271 1.635817e-42 3.332159e-39
## IFI44 8.233885 4.173575 4.505796e-37 7.648589e-34
tail(Diff_expr_test_sign$table)
## logFC logCPM PValue FDR
## COQ2 -1.313791 4.324475 0.0006706830 0.009899864
## SPSB2 -2.503310 5.045460 0.0006725495 0.009913049
## CDC25C -1.510851 4.150944 0.0006743019 0.009924515
## FBF1 -2.014673 4.609588 0.0006769504 0.009945449
## LIFR 1.929748 4.001798 0.0006782455 0.009945449
## WDR62 -1.749110 6.146809 0.0006786536 0.009945449
#Number of upregulaated genes - that is genes that are more expressed in Infected w.r.t. Mock
sum(Diff_expr_test_sign$table$logFC > 0)
## [1] 434
#Number of downregulated genes - that is genes that are less expressed in Infected w.r.t. Mock
sum(Diff_expr_test_sign$table$logFC < 0)
## [1] 261
UP_regulated_genes <- row.names(Diff_expr_test_sign$table)[Diff_expr_test_sign$table$logFC > 0]
DOWN_regulated_genes <- row.names(Diff_expr_test_sign$table)[Diff_expr_test_sign$table$logFC < 0]
boxplot(cpm(DG)[UP_regulated_genes,], notch = TRUE, outline = FALSE, col = c(rep("blue", 3),
rep("red", 3)), ylab = "CPM", xlab = "Samples", main = "CPM Distr. UP Regulated genes in RSV infected A549 cells")
boxplot(cpm(DG)[DOWN_regulated_genes,], notch = TRUE, outline = FALSE, col = c(rep("blue", 3), rep("red", 3)), ylab = "CPM", xlab = "Samples", main = "CPM Distr. DOWN Regulated genes in RSV infected A549 cells")
colors <- ifelse(Diff_expr_test$table$PValue <= 0.01, ifelse(Diff_expr_test$table$logFC > 0, "red", "blue"), "black")
plot(Diff_expr_test$table$logFC, -log10(Diff_expr_test$table$PValue), cex = 0.1, ylim = c(0,50), ylab = "-log10 Pvalue", xlab = "LogFC", main = "Vulcano Plot", col = colors)
#Scatterplot
vsScatterPlot(x = "Mock", y = "RSV", data = DG, type = "edger")
#Volcano plot i.e. a scatterplot of logFC vs FDR (adh pv)
vsVolcano(x = "Mock", y = "RSV", data = DG, type = "edger", padj = 0.01)
## Warning in eval(quote(list(...)), env): NAs introduced by coercion
#MA plot i.e. a scatter plot of expression vs logFC
vsMAPlot(x = "Mock", y = "RSV", data = DG, type = "edger", padj = 0.01)
## Warning in eval(quote(list(...)), env): NAs introduced by coercion
#Saving all the results into an excel file
wb <- createWorkbook()
addWorksheet(wb, "Expected Counts")
addWorksheet(wb, "CPM")
addWorksheet(wb, "Diff.Expr.All.Genes")
addWorksheet(wb, "Diff.Expr.SIGN")
writeData(wb = wb, sheet = "Expected Counts", x = Expected_Counts_Table, rowNames = TRUE)
writeData(wb = wb, sheet = "CPM", x = cpm(DG), rowNames = TRUE)
writeData(wb = wb, sheet = "Diff.Expr.All.Genes", x = Diff_expr_test$table, rowNames = TRUE)
writeData(wb = wb, sheet = "Diff.Expr.SIGN", x = Diff_expr_test_sign$table, rowNames = TRUE)
saveWorkbook(wb, "Diff_expr_analysis_series_8.xlsx", overwrite = TRUE)
#Loading differentailly expressed genes from Series 6 and Series 8
Series_6 <- read.xlsx("Diff_expr_analysis_series_6.xlsx", sheet = "Diff.Expr.SIGN", rowNames = T)
Series_8 <- read.xlsx("Diff_expr_analysis_series_8.xlsx", sheet = "Diff.Expr.SIGN", rowNames = T)
#Seperating Up and Down regulated genes from both series
Up_reg_S6 <- Series_6 %>%
filter(logFC > 0) %>%
row.names()
Up_reg_S8 <- Series_8 %>%
filter(logFC > 0)%>%
row.names()
Down_reg_S6 <- Series_6 %>%
filter(logFC < 0)%>%
row.names()
Down_reg_S8<- Series_8 %>%
filter(logFC < 0)%>%
row.names()
#Making the list for the Venn Diagram and Drawing the plot for up-regulated genes
venn_list <- list("Series 6" = Up_reg_S6, "Series 8" = Up_reg_S8)
ggvenn(data = venn_list,
fill_color = c("skyblue", "lightpink"))
#Making the list for the Venn Diagram and Drawing the plot for
Down-regulated genes
venn_list_Down <- list("Series 6" = Down_reg_S6, "Series 8" = Down_reg_S8)
ggvenn(data = venn_list_Down,
fill_color = c("skyblue", "lightpink"))
# The End